# Parent configuration file, referenced relative to this file.
# NOTE(review): keys below presumably override the inherited values —
# confirm against the config loader's merge rules.
_base_: './pretrain_gpt_base.yaml'


# Engine section: only the mixed-precision options are configured here.
Engine:
  mix_precision:
    level: 'o2'
    scale_loss: 32768.0  # 2**15
    # Operator names placed on the custom black list.
    # NOTE(review): presumably these ops are kept out of reduced precision —
    # confirm against the AMP implementation that consumes this list.
    custom_black_list:
      - 'reduce_sum'
      - 'c_softmax_with_cross_entropy'
      - 'elementwise_div'
      - 'where'
    # Operator names placed on the custom white list.
    custom_white_list:
      - 'lookup_table'
      - 'lookup_table_v2'
    use_fp16_guard: false


# Generation section: decoding options.
Generation:
  # Sampling controls: top_k is 0 while top_p is 0.9 with
  # use_topp_sampling enabled.
  # NOTE(review): presumably top_k: 0 means "no top-k cut-off" — confirm
  # against the sampler.
  top_k: 0
  top_p: 0.9
  use_topp_sampling: true
  inference: true
  temperature: 1.0
  # Both length bounds are set to 8.
  # NOTE(review): presumably this pins the decoded length to exactly
  # 8 tokens — confirm this is intentional.
  min_dec_len: 8
  max_dec_len: 8
  num_return_sequences: 1
  decode_strategy: 'sampling'


# Model section: module selection and network hyper-parameters.
Model:
  module: 'GPTGenerationModuleAuto'

  # Network dimensions.
  vocab_size: 51200
  hidden_size: 4096
  num_layers: 32
  num_attention_heads: 32  # hidden_size / num_attention_heads = 128
  ffn_hidden_size: 16384  # 4 * hidden_size
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02

  # Feature switches.
  use_recompute: false
  fuse_attn_qkv: true
  use_fused_dropout_add: false


# Distributed section: every degree below is set to 1.
# NOTE(review): presumably this describes a single-device run with no
# data/model/pipeline parallelism — confirm against the launcher.
Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1

  # Sharding options, nested under their own key.
  sharding:
    sharding_degree: 1
    sharding_stage: 1
